Problem Set 5

Author

Rajvi Jasani

GitHub Repository

This is the link to my GitHub repository https://github.com/rajvijasani/STATS506-Problem-Set-5.git

Problem 1 - OOP Programming

library(Rcpp)

# including the C++ function to calculate GCD
cppFunction("
int gcd(int x, int y) {
 return  std::gcd(x, y);
}")

# including the C++ function to calculate LCM
cppFunction("
int lcm(int x, int y) {
 return  std::lcm(x, y);
}")

# class definition
setClass("rational", slots = c(a = "numeric", b = "numeric"))

# constructor definition
rational <- function(a, b) {
  # checking if inputs are not numeric
  if (!is.numeric(a) || !is.numeric(b)) {
    stop("Inputs must be numeric")
  }
  # converting numeric inputs to integers
  a <- as.integer(a)
  b <- as.integer(b)
  return(new("rational", a = a, b = b))
}

# checking for validity of inputs
invisible(setValidity("rational", function(object) {
  # denominator cannot be zero
  if (as.numeric(object@b) == 0) {
    stop("Denominator can't be zero")
  }
  return(TRUE)
}))

# defining a show method for the rational class
setMethod("show", "rational", function(object) {
  # displaying the rational in mixed fraction form
  cat(object@a, "/", object@b)
  return(invisible(object))
})

# setting and defining a new method to simplify the input rational
invisible(setGeneric("simplify", function(object) {
  standardGeneric("simplify")
}))
setMethod("simplify", "rational", function(object) {
  # calculating the gcd of the numerator and denominator
  gcd_val <- gcd(object@a, object@b)
  # dividing the numerator and denominator by their gcd
  # to simply the fraction
  object@a <- object@a / gcd_val
  object@b <- object@b / gcd_val
  return(object)
})

# setting and defining a new method to find the quotient in decimal form
invisible(setGeneric("quotient", function(object, digits = 5) {
  standardGeneric("quotient")
}))
setMethod("quotient", "rational", function(object, digits = 5) {
  # printing the quotient rounded up to the digits given as input
  print(round(object@a / object@b, digits))
  # returning the actual quotient without rounding off
  return(invisible(object@a / object@b))
})

# defining addition operation for 2 rational objects
setMethod("+", signature(e1 = "rational", e2 = "rational"), function(e1, e2) {
  # finding the lcm of denominators
  b <- lcm(e1@b, e2@b)
  # multiplying both numerators by remainder of division
  # of lcm by corresponding denominator
  # (basic method in math)
  # and adding them to get the new numerator
  a <- (e1@a * (b / e1@b)) + (e2@a * (b / e2@b))
  # returning the simplified fraction
  return(simplify(rational(a = a, b = b)))
})

# defining subtraction operation for 2 rational objects
setMethod("-", signature(e1 = "rational", e2 = "rational"), function(e1, e2) {
  # finding the lcm of denominators
  b <- lcm(e1@b, e2@b)
  # multiplying both numerators by remainder of division
  # of lcm by corresponding denominator
  # (basic method in math)
  # and subtracting them to get the new numerator
  a <- (e1@a * (b / e1@b)) - (e2@a * (b / e2@b))
  # returning the simplified fraction
  return(simplify(rational(a = a, b = b)))
})

# defining multiplication operation for 2 rational objects
setMethod("*", signature(e1 = "rational", e2 = "rational"), function(e1, e2) {
  # multiplying the numerators
  a <- e1@a * e2@a
  # multiplying the denominators
  b <- e1@b * e2@b
  # returning the simplified fraction
  return(simplify(rational(a = a, b = b)))
})

# defining division operation for 2 rational objects
setMethod("/", signature(e1 = "rational", e2 = "rational"), function(e1, e2) {
  # if either of the numerators is a zero,
  # the result will be 0/1
  # this also avoids errors when inverting one fraction
  if (e1@a == 0 || e2@a == 0) {
    return(rational(a = 0, b = 1))
  }
  # inverting 2nd fraction and performing
  # multiplication of numerators and denominators
  a <- e1@a * e2@b
  b <- e1@b * e2@a
  # returning the simplified fraction
  return(simplify(rational(a = a, b = b)))
})
r1 <- rational(a = 24, b = 6)
r2 <- rational(a = 7, b = 230)
r3 <- rational(a = 0, b = 4)
print(r1)
24 / 6
print(r3)
0 / 4
print(r1 + r2)
927 / 230
print(r1 - r2)
913 / 230
print(r1 * r2)
14 / 115
print(r1 / r2)
920 / 7
print(r1 + r3)
4 / 1
print(r1 * r3)
0 / 1
print(r2 / r3)
0 / 1
quotient(r1)
[1] 4
quotient(r2)
[1] 0.03043
quotient(r2, digits = 3)
[1] 0.03
quotient(r2, digits = 3.14)
[1] 0.03
tryCatch({
  quotient(r2, digits = "avocado")
}, error = function(e) {
  message("Error: ", e$message)
})
Error: non-numeric argument to mathematical function
q2 <- quotient(r2, digits = 3)
[1] 0.03
print(q2)
[1] 0.03043478
quotient(r3)
[1] 0
print(simplify(r1))
4 / 1
print(simplify(r2))
7 / 230
print(simplify(r3))
0 / 1
# Checking when denominator is zero
tryCatch({
  print(rational(a = 1, b = 0))
}, error = function(e) {
  message("Error: ", e$message)
})
Error: Denominator can't be zero
# Checking when denominator is non numeric
tryCatch({
  print(rational(a = 1, b = "hello"))
}, error = function(e) {
  message("Error: ", e$message)
})
Error: Inputs must be numeric
# Checking when numerator is non numeric
tryCatch({
  print(rational(a = "world", b = 4))
}, error = function(e) {
  message("Error: ", e$message)
})
Error: Inputs must be numeric

Problem 2 - plotly

library(ggplot2)
library(plotly)
Warning: package 'plotly' was built under R version 4.4.2
df <- read.csv("data/df_for_ml_improved_new_market.csv")
# combining columns of different genre into one (opposite of factoring)
df_long <- df |>
  reshape(
    varying = grep("^Genre", names(df), value = TRUE),
    v.names = "Presence",
    timevar = "Genre",
    times = grep("^Genre", names(df), value = TRUE),
    direction = "long"
  )
df_long <- df_long[df_long$Presence == 1, ]

# counting occurrences of each combination of year and Genre
df_genre_distribution <- as.data.frame(table(df_long$year, df_long$Genre))
colnames(df_genre_distribution) <- c("year", "Genre", "n")

plot1 <- df_genre_distribution |>
  plot_ly(
    x = ~ year,
    y = ~ n,
    color = ~ Genre,
    type = "bar",
    text = ~ paste("Year:", year, "<br>Genre:", Genre, "<br>Count:", n),
    hoverinfo = "text"
  ) |>
  layout(
    title = "Genre Distribution Over Years",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Count"),
    barmode = "stack"
  )

plot1

A stacked bar plot with different genres colored differently can help easily spot the trend. It is quite clear from this graph that the distribution of genre of sales changes over the years but differs from genre to genre. At the beginning, the sales for each genre were quite low. With time, we see there is an increase in sales of sculptures and photographs. However, they both seem to have a steady amount of sales in the last half, between the years 2005-2010. The other genres also have increase in sales but the count isn’t as major as photography and sculptures. One interesting observation I noticed was that the sales of paintings genre looks almost equal to other genre. This can be verified by looking at the count for each by hovering over them. This is where plotly’s inter-activeness comes in handy.

# overall average sales price over years
df_avg_price <- aggregate(price_usd ~ year,
                          data = df,
                          FUN = mean,
                          na.rm = TRUE)

# genre-wise average sales price over years
df_genre_price <- aggregate(price_usd ~ year + Genre,
                            data = df_long,
                            FUN = mean,
                            na.rm = TRUE)


# plot for overall average sales price over years
plot2 <- plot_ly(
  data = df_avg_price,
  x = ~ year,
  y = ~ price_usd,
  type = 'scatter',
  mode = 'lines',
  name = 'Average Price (Overall)',
  line = list(color = 'black')
)

genre_colors <- c('red', 'green', 'purple', 'orange', 'cyan')

# creating a separate trace for each genre and adding it to the overall plot
for (i in 1:length(unique(df_genre_price$Genre))) {
  genre <- unique(df_genre_price$Genre)[i]
  plot2 <- plot2 |>
    add_trace(
      data = subset(df_genre_price, Genre == genre),
      x = ~ year,
      y = ~ price_usd,
      type = 'scatter',
      mode = 'lines',
      name = paste('Average Price (', genre, ')', sep = ""),
      line = list(color = genre_colors[i], width = 1)
    )
}

# layout to adjust title and axis labels
plot2 <- plot2 |>
  layout(
    title = "Average Sales Price in USD Over Time and by Genre",
    xaxis = list(title = "Year"),
    yaxis = list(title = "Average Price (USD)"),
    legend = list(title = list(text = "Genre"))
  )

plot2

The line plot is the simplest and most informative graph to look at a trend. Looking at the graph of overall average, we see that there is an overall increase in average sales price over the years until 2008 after which we can notice an overall decline. The increase and decrease in the prices is not constant and varies over period of years. The genre-wise plots show the demand of each genre through their sale prices over years. We can identify that some genres are more susceptible to market trends/popularity, like Photography and Print, while others are more stable. When there is an increase in demand of a particular genre, like Photography peaking in 2008, the overall sale prices also increased dramatically. Similarly, the rise and falls seen in overall average line can be justified and understood by looking at the genre-wise lines.

Attribution of Source: Used ChatGPT for help with reshape, aggregate and plot_ly syntax.

Problem 3 - data.table

library(data.table)
library(nycflights13)
flights <- data.table(flights)

Departure

airports <- data.table(airports)
flights[, .(mean_delay = round(mean(dep_delay, na.rm = TRUE), 3),
            median_delay = median(dep_delay, na.rm = TRUE)), by = origin][airports, on = .(origin = faa), nomatch = NULL][, .(name, mean_delay, median_delay)][order(-mean_delay)]
                  name mean_delay median_delay
                <char>      <num>        <num>
1: Newark Liberty Intl     15.108           -1
2: John F Kennedy Intl     12.112           -1
3:          La Guardia     10.347           -3

Arrival

flights_modified <- flights[, .(
  mean_delay = round(mean(arr_delay, na.rm = TRUE), 3),
  median_delay = median(arr_delay, na.rm = TRUE),
  num_flights = .N
), by = dest][num_flights >= 10]
# created a new data.table called flights_modified to right join on airports
# there are a few entries not included in the airports data
# but are present in the flights data
# thus right join is required to include these entries.
# we use faa code to represent these airports instead of name
airports[flights_modified, on = .(faa = dest), nomatch = NA][, name := ifelse(is.na(name), faa, name)][, .(name, mean_delay, median_delay)][order(-mean_delay)]
                          name mean_delay median_delay
                        <char>      <num>        <num>
  1:     Columbia Metropolitan     41.764         28.0
  2:                Tulsa Intl     33.660         14.0
  3:         Will Rogers World     30.619         16.0
  4:      Jackson Hole Airport     28.095         15.0
  5:             Mc Ghee Tyson     24.069          2.0
 ---                                                  
 98:       Seattle Tacoma Intl     -1.099        -11.0
 99:             Honolulu Intl     -1.365         -7.0
100:                       STT     -3.836         -9.0
101: John Wayne Arpt Orange Co     -7.868        -11.0
102:         Palm Springs Intl    -12.722        -13.5
planes <- data.table(planes)
# converting from minutes to hours
# grouping by model as different tail number planes can be of same model
flights[, time := air_time / 60][, flight_speed := distance / time][planes, on = .(tailnum), nomatch = NULL][, .(avg_speed = mean(flight_speed, na.rm = TRUE), num_flights = .N), by = model][avg_speed == max(avg_speed, na.rm = TRUE)]
     model avg_speed num_flights
    <char>     <num>       <int>
1: 777-222  482.6254           4